{
 "metadata": {
  "name": ""
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import os\n",
      "import re\n",
      "import numpy as np\n",
      "from collections import defaultdict\n",
      "from operator import itemgetter"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 1
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "word_search_re = re.compile(r\"[\\w']+\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 2
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def load_model(model_filename):\n",
      "    model = defaultdict(lambda: defaultdict(float))\n",
      "    with open(model_filename) as inf:\n",
      "        for line in inf:\n",
      "            word, values = line.split(maxsplit=1)\n",
      "            word = eval(word)\n",
      "            values = eval(values)\n",
      "            model[word] = values\n",
      "    return model"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 3
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "model_filename = os.path.join(os.path.expanduser(\"~\"), \"models\", \"part-00000\")\n",
      "model = load_model(model_filename)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 4
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "model[\"i\"][\"male\"], model[\"i\"][\"female\"]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 5,
       "text": [
        "(409.7987003114851, 513.3231594734408)"
       ]
      }
     ],
     "prompt_number": 5
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def nb_predict(model, document):\n",
      "    words = word_search_re.findall(document)\n",
      "    probabilities = defaultdict(lambda : 0)\n",
      "    for word in set(words):\n",
      "        probabilities[\"male\"] += np.log(model[word].get(\"male\", 1e-5))\n",
      "        probabilities[\"female\"] += np.log(model[word].get(\"female\", 1e-5))\n",
      "    # Now find the most likely gender\n",
      "    most_likely_genders = sorted(probabilities.items(), key=itemgetter(1), reverse=True)\n",
      "    return most_likely_genders[0][0]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 6
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "new_post = \"\"\" Every day should be a half day.  Took the afternoon off to hit the dentist, and while I was out I managed to get my oil changed, too.  Remember that business with my car dealership this winter?  Well, consider this the epilogue.  The friendly fellas at the Valvoline Instant Oil Change on Snelling were nice enough to notice that my dipstick was broken, and the metal piece was too far down in its little dipstick tube to pull out.  Looks like I'm going to need a magnet.   Damn you, Kline Nissan, daaaaaaammmnnn yooouuuu....   Today I let my boss know that I've submitted my Corps application.  The news has been greeted by everyone in the company with a level of enthusiasm that really floors me.     The back deck has finally been cleared off by the construction company working on the place.  This company, for anyone who's interested, consists mainly of one guy who spends his days cursing at his crew of Spanish-speaking laborers.  Construction of my deck began around the time Nixon was getting out of office.\n",
      "\"\"\""
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 7
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "nb_predict(model, new_post)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 8,
       "text": [
        "'male'"
       ]
      }
     ],
     "prompt_number": 8
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "testing_folder = os.path.join(os.path.expanduser(\"~\"), \"Data\", \"blogposts_testing\")\n",
      "testing_filenames = []\n",
      "for filename in os.listdir(testing_folder):\n",
      "    testing_filenames.append(os.path.join(testing_folder, filename))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 9
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def nb_predict_many(model, input_filename):\n",
      "    with open(input_filename) as inf:\n",
      "        # remove leading and trailing whitespace\n",
      "        for line in inf:\n",
      "            tokens = line.split()\n",
      "            actual_gender = eval(tokens[0])\n",
      "            blog_post = eval(\" \".join(tokens[1:]))\n",
      "            yield actual_gender, nb_predict(model, blog_post)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 10
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def nb_predict(model, document):\n",
      "    words = word_search_re.findall(document)\n",
      "    probabilities = defaultdict(lambda : 1)\n",
      "    for word in set(words):\n",
      "        probabilities[\"male\"] += np.log(model[word].get(\"male\", 1e-15))\n",
      "        probabilities[\"female\"] += np.log(model[word].get(\"female\", 1e-15))\n",
      "    # Now find the most likely gender\n",
      "    most_likely_genders = sorted(probabilities.items(), key=itemgetter(1), reverse=True)\n",
      "    return most_likely_genders"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 11
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "y_true = []\n",
      "y_pred = []\n",
      "for testing_filename in testing_filenames:\n",
      "    for actual_gender, ratios in nb_predict_many(model, testing_filename):\n",
      "        predicted_gender = ratios[0][0]\n",
      "        y_true.append(actual_gender == \"female\")\n",
      "        y_pred.append(predicted_gender == \"female\")\n",
      "y_true = np.array(y_true, dtype='int')\n",
      "y_pred = np.array(y_pred, dtype='int')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 13
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from sklearn.metrics import f1_score\n",
      "print(\"f1={:.4f}\".format(f1_score(y_true, y_pred, pos_label=None)))\n",
      "print(\"acc={:.4f}\".format(np.mean(y_true == y_pred)))\n",
      "      \n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "f1=0.5540\n",
        "acc=0.5765\n"
       ]
      }
     ],
     "prompt_number": 14
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "aws_model_filename = os.path.join(os.path.expanduser(\"~\"), \"models\", \"model_aws\")\n",
      "aws_model = load_model(aws_model_filename)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 15
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "y_true = []\n",
      "y_pred = []\n",
      "for testing_filename in testing_filenames:\n",
      "    for actual_gender, predicted_gender in nb_predict_many(aws_model, testing_filename):\n",
      "        predicted_gender = ratios[0][0]\n",
      "        y_true.append(actual_gender == \"female\")\n",
      "        y_pred.append(predicted_gender == \"female\")\n",
      "        #print(\"Actual: {0}\\tPredicted: {1}\".format(actual_gender, predicted_gender))\n",
      "        if len(y_true) > 500:\n",
      "            break\n",
      "y_true = np.array(y_true, dtype='int')\n",
      "y_pred = np.array(y_pred, dtype='int')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 16
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "print(\"f1={:.4f}\".format(f1_score(y_true, y_pred, pos_label=None)))\n",
      "print(\"acc={:.4f}\".format(np.mean(y_true == y_pred)))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "f1=0.8144\n",
        "acc=0.8734\n"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stderr",
       "text": [
        "/usr/local/lib/python3.4/dist-packages/sklearn/metrics/metrics.py:1771: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.\n",
        "  'precision', 'predicted', average, warn_for)\n"
       ]
      }
     ],
     "prompt_number": 17
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "print(list(zip(y_true, y_pred))[:10])"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "[(0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0)]\n"
       ]
      }
     ],
     "prompt_number": 18
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from sklearn.metrics import confusion_matrix"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 19
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "confusion_matrix(y_true, y_pred)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 20,
       "text": [
        "array([[614,   0],\n",
        "       [ 89,   0]])"
       ]
      }
     ],
     "prompt_number": 20
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}